# importing required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# for pandas profiling
import pandas_profiling as pp
# Load the raw used-car listings and take a first exploratory look.
dataset = pd.read_csv('data/Car details v3.csv')
dataset.head(5)
dataset.shape
dataset.info()
dataset.describe()
# Automated EDA report rendered inline in the notebook.
# NOTE(review): pandas_profiling is deprecated upstream in favour of
# ydata-profiling -- consider migrating when dependencies are updated.
profile = pp.ProfileReport(dataset)
profile.to_notebook_iframe()
# Per-column null counts; these drive the cleaning steps that follow.
dataset.isnull().sum()
# Cleaning Mileage column.
# Raw values look like '23.4 kmpl' / '17.3 km/kg': keep the numeric
# token before the unit.  The pandas .str accessor propagates NaN, so
# the original fillna('0 kmpl') sentinel round-trip is unnecessary --
# missing entries stay NaN throughout, and the final replace maps any
# literal 0 readings to NaN exactly as the original code did.
dataset['mileage'] = (
    dataset['mileage']
    .str.split(' ').str[0]   # numeric token before the unit
    .astype(float)
    .replace(0, np.nan)      # 0 is a placeholder, not a real mileage
)
# sanity check
dataset['mileage']
# Cleaning Engine column.
# Raw values look like '1248 CC': keep the numeric token.  Parsed as
# float directly -- the original parsed int, but its 0 -> NaN
# replacement coerced the column to float anyway, so the result is
# identical.  NaN inputs propagate through the .str accessor, making
# the original fillna('0 CC') sentinel round-trip unnecessary.
dataset['engine'] = (
    dataset['engine']
    .str.split(' ').str[0]
    .astype(float)
    .replace(0, np.nan)      # 0 was the null sentinel
)
# sanity check
dataset['engine']
# Cleaning max_power column.
# Known bad raw values: the literal string '0', the unit-only string
# ' bhp' (which would split to an empty token and crash float()), and
# NaN.  Normalise all three to the '0 bhp' sentinel first, exactly as
# before, then extract the numeric token and restore NaN.
dataset['max_power'] = (
    dataset['max_power']
    .replace({'0': '0 bhp', ' bhp': '0 bhp', np.nan: '0 bhp'})
    .str.split(' ').str[0]
    .astype(float)
    .replace(0, np.nan)      # sentinel back to NaN
)
# sanity check
dataset['max_power']
# Derive a 'car' label from the first two tokens of the full name
# (brand + model), e.g. 'Maruti Swift Dzire VDI' -> 'Maruti Swift'.
# Vectorised .str slice/join replaces the manual list comprehension.
dataset['car'] = dataset['name'].str.split(' ').str[:2].str.join(' ')
# sanity check
dataset['car']
# torque is free-text with mixed units and name is superseded by
# 'car', so drop both in a single call instead of two.
dataset.drop(columns=['torque', 'name'], inplace=True)
# re-check remaining nulls after the parsing above
dataset.isnull().sum()
# Car groups with the highest mean mileage (tail(11) cut-off appears
# empirical).  NOTE(review): groupby(...).mean() over the whole frame
# averages every non-key column; on pandas >= 2.0 this raises for
# non-numeric columns unless numeric_only=True is passed -- confirm
# the pinned pandas version.
null_cars = dataset.groupby('car').mean()['mileage'].sort_values().tail(11).index
# Collect those groups' rows into one frame for inspection.
x = pd.DataFrame()
for i in null_cars:
    x=pd.concat([x,dataset[dataset['car']==i]],axis=0)
# Drop the affected rows by hard-coded positional index.
# NOTE(review): these literal indices were presumably read off from x
# above -- they silently go stale if the CSV changes; recomputing from
# x.index would be safer.  TODO confirm against the source notebook.
dataset = dataset.drop(index=[31, 5005, 7538, 1347, 1216, 1432, 1815, 2867, 3922, 5385, 5451,
    316, 535, 1933, 6516, 7275, 7528, 7337, 6730, 3310, 2959, 3456]).reset_index(drop=True)
# directly filling these columns with the respective means in there by the grouping of car
# (groupby().transform returns every non-key column, so this assignment
# relies on the listed columns matching that output's order exactly)
dataset[['year', 'selling_price', 'km_driven', 'mileage', 'engine', 'max_power',
    'seats']]=dataset.groupby("car").transform(lambda x: x.fillna(x.mean()))
# sanity check
dataset.isnull().sum()
# Inspect dtypes, then do light feature engineering.
dataset.dtypes
# Vehicle age relative to the 2021 reference year supersedes the raw
# model year.
dataset['Age'] = 2021 - dataset['year']
dataset.drop(columns=['year'], inplace=True)
# Seat count is a discrete label rather than a quantity, so store it
# as object dtype and treat it as categorical from here on.
dataset['seats'] = dataset['seats'].astype(object)
# Separating numerical and categorical columns
df_int = dataset.select_dtypes(include=np.number)
df_cat = dataset.select_dtypes(include='O')
# Correlation matrix of the numeric features, annotated with values.
sns.heatmap(df_int.corr(),annot=True,cmap=plt.cm.Blues_r)
plt.show()
# One box plot per numeric column to eyeball outliers.
for i in df_int.columns:
    plt.boxplot(df_int[i])
    plt.xlabel(i)
    plt.show()
# Column means at a glance.
df_int.describe().loc['mean']
# Selling price vs mileage, coloured by fuel type.
sns.scatterplot(data = dataset,x = 'selling_price',y= 'mileage',hue = 'fuel') #
plt.grid()
plt.show()
# Selling-price distribution per fuel type.
plt.figure(figsize=(15,8))
sns.stripplot(y=dataset['selling_price'],x=dataset['fuel'])
plt.grid()
plt.show()
# Selling-price distribution per ownership history.
plt.figure(figsize=(15,8))
sns.stripplot(y=dataset['selling_price'],x=dataset['owner'])
plt.grid()
plt.show()
# Mean selling price and mean km driven as a function of vehicle age,
# overlaid on a single axis.
plt.figure(figsize=(12,8))
df_int.groupby(['Age']).mean()['selling_price'].plot(kind='line',grid=True,label='Selling_price')
df_int.groupby(['Age']).mean()['km_driven'].plot(kind='line',grid=True)
plt.legend()
plt.show()
from warnings import filterwarnings
# NOTE(review): suppressing *all* warnings globally hides real issues;
# kept to match the original output, but consider scoping it.
filterwarnings('ignore')
# Count plot for every categorical column except the last one
# (the high-cardinality 'car' label, presumably skipped because it has
# too many levels to plot -- confirm).
for col in df_cat.iloc[:, :-1]:
    plt.figure(figsize=(10, 7))
    # Pass the series as the keyword x=: the bare positional form was
    # deprecated in seaborn 0.12 and removed in later releases.
    sns.countplot(x=df_cat[col])
    plt.show()
sns.countplot(data=dataset, x='fuel')
plt.show()
# Mean imputation left fractional seat counts; round them back to ints.
dataset['seats'] = dataset['seats'].map(round)
# Fuel-type composition broken down by seller type, transmission and
# ownership history -- one grouped bar chart per pairing.
for other in ('seller_type', 'transmission', 'owner'):
    pd.crosstab(dataset['fuel'], dataset[other]).plot(kind='bar', figsize=(12, 8))
    plt.show()
from sklearn.preprocessing import LabelEncoder
# Integer-encode every categorical column (fresh fit per column, so the
# codes follow the sorted order of that column's classes), then join
# the encoded categoricals with the numeric features.
encoder = LabelEncoder()
label_cat = pd.DataFrame(
    {col: encoder.fit_transform(df_cat[col]) for col in df_cat.columns}
)
X_base = pd.concat([label_cat, df_int], axis=1)
X_base
# Target is the selling price; everything else is a feature.
y = X_base['selling_price'].values
X_base.drop(columns=['selling_price'], inplace=True)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
# Reproducible split (default 75/25) with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X_base, y, random_state=8)
# Baseline model: ordinary least squares on the encoded features.
linreg = LinearRegression()
linreg.fit(X_train, y_train)
# R^2 on the training data, then on the held-out test data.
linreg.score(X_train, y_train)
linreg.score(X_test, y_test)
import statsmodels.api as sm
# Refit with statsmodels for the full inferential summary
# (coefficients, p-values, F-statistic).  add_constant supplies the
# intercept column that sm.OLS does not add on its own.
model = sm.OLS(y_train, sm.add_constant(X_train)).fit()
print(model.summary())
residual = model.resid
# Jarque-Bera normality test on the RESIDUALS (the original comment
# incorrectly said "predictor variable"):
#   H0: residuals are normally distributed
#   H1: residuals are not normally distributed
from scipy.stats import jarque_bera
jarque_bera(residual)
# Residuals vs fitted values -- a funnel shape here would indicate
# heteroscedasticity.
fitted = model.fittedvalues
# Keyword x=/y=: positional data args to scatterplot were deprecated
# in seaborn 0.12 and removed in later releases.
sns.scatterplot(x=fitted, y=residual)
plt.show()
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Variance inflation factor per feature to gauge multicollinearity
# among the predictors.
feature_matrix = X_base.values
vif_data = pd.DataFrame({
    "feature": X_base.columns,
    "VIF": [variance_inflation_factor(feature_matrix, idx)
            for idx in range(feature_matrix.shape[1])],
})
print(vif_data)
# Decision Tree Regressor: fit, then report R^2 on train and test.
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor().fit(X_train, y_train)
tree_reg.score(X_train, y_train)
tree_reg.score(X_test, y_test)
# Random Forest Regressor: fit, then report R^2 on train and test.
from sklearn.ensemble import RandomForestRegressor
forest = RandomForestRegressor().fit(X_train, y_train)
forest.score(X_train, y_train)
forest.score(X_test, y_test)
# XGBoost Regressor.
# NOTE: the original '!pip3 install xgboost' line is IPython
# shell-escape syntax and a SyntaxError in plain Python -- install the
# package from the command line (pip3 install xgboost) before running.
import xgboost as xgb
# Boosted trees with per-tree column subsampling (colsample_bytree)
# and L1 regularisation (alpha) to curb overfitting.
xgb_reg = xgb.XGBRegressor(objective='reg:squarederror',
                           colsample_bytree=0.3,
                           learning_rate=0.1,
                           max_depth=10,
                           alpha=1,
                           n_estimators=250)
xgb_reg.fit(X_train, y_train)
# R^2 on the training data, then on the held-out test data.
xgb_reg.score(X_train, y_train)
xgb_reg.score(X_test, y_test)